# Load data

zip_cbsa_data <- read_csv(url('https://raw.githubusercontent.com/cyouh95/third-way-report/master/assets/data/zip_code_cbsa.csv'))
hs_data <- read_csv(url('https://github.com/cyouh95/third-way-report/blob/master/assets/data/hs_data.csv?raw=true'), col_types = c('zip_code' = 'c'))

ceeb_nces <- read_csv(url('https://github.com/mpatricia01/public_requests_eda/raw/main/data/ceeb_nces_crosswalk.csv'))
cds_nces <- read_csv(url('https://github.com/mpatricia01/public_requests_eda/raw/main/data/CDS_NCES_crosswalk.csv')) %>% 
  mutate(ncessch = str_c(NCESDist, NCESSchool))

load(url('https://github.com/mpatricia01/public_requests_eda/raw/main/data/145637_orders.RData'))
# Contains: IL_orders, OOS_orders, OOS_eng_orders, OOS_noneng_orders, intl_orders,
#           lists_df_pivot, lists_df_sat, lists_df_act, df_sat_ca_20, df_sat_ca_19

# Add 11 + 12 columns for the SAT test takers datasets
add_testtakers_cols <- function(sat_df) {
  sat_df %>% mutate(
    Enroll1112 = as.numeric(Enroll12) + as.numeric(Enroll11),
    NumTSTTakr1112 = NumTSTTakr11 + NumTSTTakr12,
    NumERWBenchmark1112 = as.numeric(NumERWBenchmark11) + as.numeric(NumERWBenchmark12),
    PctERWBenchmark1112 = as.numeric(NumERWBenchmark1112) / as.numeric(NumTSTTakr1112),
    NumMathBenchmark1112 = as.numeric(NumMathBenchmark11) + as.numeric(NumMathBenchmark12),
    PctMathBenchmark1112 = as.numeric(NumMathBenchmark1112) / as.numeric(NumTSTTakr1112),
    TotNumBothBenchmark1112 = as.numeric(TotNumBothBenchmark11) + as.numeric(TotNumBothBenchmark12),
    PctBothBenchmark1112 = as.numeric(TotNumBothBenchmark1112) / as.numeric(NumTSTTakr1112)
  )
}

df_sat_ca_20 <- add_testtakers_cols(df_sat_ca_20)
df_sat_ca_19 <- add_testtakers_cols(df_sat_ca_19)

1 Illinois purchases (51 orders)

IL_orders

Generally, on each purchase date, they make 6 IL orders by race/ethnicity and test scores:

  • Race/ethnicity groups (A) + Test score range (low A)
  • Race/ethnicity groups (A) + Test score range (med A)
  • Race/ethnicity groups (A) + Test score range (high A)
  • Race/ethnicity groups (B) + Test score range (low B)
  • Race/ethnicity groups (B) + Test score range (med B)
  • Race/ethnicity groups (B) + Test score range (high B)

Group/filter definitions:

  • Race/ethnicity groups (A)
    • Asian (including Indian subcontinent and Philippines origin)|Other|I do not wish to respond to race|No, not of Hispanic, Latino, or Spanish origin|White (including Middle Eastern origin)|Native Hawaiian or Other Pacific Islander
      • Starting from Fall 2019, they removed “Native Hawaiian or Other Pacific Islander” from the med and high orders
      • In the most recent 2 rounds of purchases (6 orders from Dec 2019 & Apr 2020), they further limited geographic filter to 12 specific metro areas within IL
    • Test score range (low A): 1160 - 1270
    • Test score range (med A): 1260/1280 - 1440
    • Test score range (high A): 1450 - 1600
  • Race/ethnicity groups (B)
    • Black or African American|American Indian or Alaska Native|Other Hispanic or Latino|Puerto Rican|Mexican|Hispanic or Latino (including Spanish origin)|Cuban
      • Starting from Fall 2019, they added “Native Hawaiian or Other Pacific Islander” to all orders (low, med, and high)
    • Test score range (low B): 1020/1030 - 1190
    • Test score range (med B): 1200 - 1350/1380
    • Test score range (high B): 1360/1390 - 1600

Common filters:

  • GPA: A+ to B-
  • Class rank: Highest tenth to second fifth

2 Out-of-states purchases (22 orders)

Based on geographic filters, we can categorize their orders into 3 broad categories:

  • Specific MSAs (3 orders)
  • Non-engineering majors in specific MSAs + CA, CT, MO, and Armed Forces military districts (6 orders)
  • Engineering majors in all 49 states except IL + DC (13 orders)

21 of the 22 out-of-state orders also use these segment analysis filters (total possible: 33 neighborhood clusters and 29 high-school clusters):

  • EN:51, HS:65 | EN:51, HS:68 | EN:51, HS:70 | EN:51, HS:79
  • EN:53, HS:65 | EN:53, HS:70 |
  • EN:58, HS:64 | EN:58, HS:65 | EN:58, HS:70
  • EN:60, HS:65 | EN:60, HS:68 | EN:60, HS:70 | EN:60, HS:79
  • EN:61, HS:ALL
  • EN:63, HS:65 | EN:63, HS:70
  • EN:69, HS:68 | EN:69, HS:70 | EN:69, HS:75
  • EN:70, HS:66 | EN:70, HS:68 | EN:70, HS:70 | EN:70, HS:79
  • EN:73, HS:65 | EN:73, HS:70
  • EN:78, HS:ALL

Sample neighborhood clusters (EN) characteristics from 2011:

Sample high school clusters (HS) characteristics from 2011:

2.1 MSA orders (3 orders)

OOS_msa_orders <- OOS_orders %>% filter(order_num %in% c('500590', '567376', '483751'))

OOS_msa_orders

They made 2 “OOS Regional MSA” orders 1 “Regional Counselor MSAs” order.

  • They used the segment analysis from above
  • A+/B- high school GPA
  • MSAs from states: CA, TX, GA, FL, NY, NJ

2.1.1 Student-level MSA specific orders

Number of students purchased in the three orders

OOS_msa_stu %>% count(order_num) %>% arrange(-n)

It appears that 4 students might of been purchased twice

OOS_msa_stu %>%
  group_by(Ref, order_num) %>%
  summarize(n_per_grp = n()) %>%
  ungroup() %>%
  count(n_per_grp)
OOS_msa_stu %>%
  group_by(Ref, order_num) %>%
  summarize(n_per_grp = n()) %>%
  ungroup() %>%
  filter(n_per_grp > 1) #4 students were purchased twice

This table below shows the number of zip codes purchased by descending order

  • Grouped by zip code and summarized to get a count of the number of students purchased within each zip code
  • Merged in census zip-code level data to show demographic data associated with each zip code
OOS_msa_zip %>%
  dplyr::select(zip_code, n_per_grp, median_household_income, starts_with("pct"))

Filtered to the top five zip codes purchased

  • Calculated the race/ethnicity for students purchased by zip code to create percent race/ethnicity variables

Zip code 77494

  • Filtered to the zip code 77494 to compare the racial/ethnic composition of students purchased in this zip code and the racial/ethnic composition of the population of this zip code
OOS_msa_77494_zip <- OOS_msa_top_5_zip %>%
  filter(zip_code == "77494") %>%
  dplyr::select(zip_code, stu_pct_white, pct_white, stu_pct_black, pct_black, stu_pct_asian, pct_asian, stu_pct_latinx, pct_latinx, stu_pct_nhpi, pct_nat_hi, stu_pct_natam, pct_nat_am) %>% head(n=1)

OOS_msa_77494_zip

Zip code 77479

  • Filtered to the zip code 77479 to compare the racial/ethnic composition of students purchased in this zip code and the racial/ethnic composition of the population of this zip code
OOS_msa_77479_zip <- OOS_msa_top_5_zip %>%
  filter(zip_code == "77479") %>%
  dplyr::select(zip_code, stu_pct_white, pct_white, stu_pct_black, pct_black, stu_pct_asian, pct_asian, stu_pct_latinx, pct_latinx, stu_pct_nhpi, pct_nat_hi, stu_pct_natam, pct_nat_am) %>% head(n=1)

OOS_msa_77479_zip

Zip code 94582

  • Filtered to the zip code 94582 to compare the racial/ethnic composition of students purchased in this zip code and the racial/ethnic composition of the population of this zip code
OOS_msa_94582_zip <- OOS_msa_top_5_zip %>%
  filter(zip_code == "94582") %>%
  dplyr::select(zip_code, stu_pct_white, pct_white, stu_pct_black, pct_black, stu_pct_asian, pct_asian, stu_pct_latinx, pct_latinx, stu_pct_nhpi, pct_nat_hi, stu_pct_natam, pct_nat_am) %>% head(n=1)

OOS_msa_94582_zip

Zip code 77382

  • Filtered to the zip code 77382 to compare the racial/ethnic composition of students purchased in this zip code and the racial/ethnic composition of the population of this zip code
OOS_msa_77382_zip <- OOS_msa_top_5_zip %>%
  filter(zip_code == "77382") %>%
  dplyr::select(zip_code, stu_pct_white, pct_white, stu_pct_black, pct_black, stu_pct_asian, pct_asian, stu_pct_latinx, pct_latinx, stu_pct_nhpi, pct_nat_hi, stu_pct_natam, pct_nat_am) %>% head(n=1)

OOS_msa_77382_zip

Zip code 30024

  • Filtered to the zip code 30024 to compare the racial/ethnic composition of students purchased in this zip code and the racial/ethnic composition of the population of this zip code
OOS_msa_30024_zip <- OOS_msa_top_5_zip %>%
  filter(zip_code == "30024") %>%
  dplyr::select(zip_code, stu_pct_white, pct_white, stu_pct_black, pct_black, stu_pct_asian, pct_asian, stu_pct_latinx, pct_latinx, stu_pct_nhpi, pct_nat_hi, stu_pct_natam, pct_nat_am) %>% head(n=1)

OOS_msa_30024_zip

2.2 Non-ENG orders (6 orders)

OOS_noneng_orders

They made 5 “OOS Non-ENG” orders & 1 “OOS ENG and Non-ENG” order between Feb 2018 & Jun 2019 w/ the criteria:

  • Specific MSAs + CA, CT, MO, and Armed Forces military districts
  • Above segment analysis filter for the 5 Non-ENG only orders but not the 1 order that included ENG majors too
  • A+ to B- high school GPA
  • SAT score of 1230/1240 - 1450 or PSAT score of 1220/1230/1240 - 1450
    • The 1 ENG & Non-ENG order had criteria of SAT 1240 - 1450 or PSAT 1220 - 1450

2.3 ENG orders (13 orders)

OOS_eng_orders
# Lower test score criteria for female students
OOS_eng_orders %>% dplyr::select(gender, sat_score_min, sat_score_max, psat_score_min, psat_score_max) %>%
  distinct() %>% arrange(sat_score_min)
# Fewer female students purchased
OOS_eng_orders %>% group_by(gender) %>%
  summarise(num_orders = n(), total_cost = sum(order_cost), total_students = sum(num_students))

They made 6 “OOS ENG Male” orders & 7 “OOS ENG Female” orders between Feb 2018 & Apr 2020 w/ the criteria:

  • All 49 states except IL + DC
  • Above segment analysis filter
  • A+ to B- high school GPA
  • SAT/PSAT score filter which differs by gender

General observations:

  • More purchases made for female students and w/ lower test score criteria, but resulting female engineering majors in list still less than total male students

3 International purchases (4 orders)

intl_orders

They made 4 international orders between May 2018 & June 2019 with the following search criteria:

  • All filter for A+/B- high school GPA
  • SAT/PSAT scores vary by country.
intl_orders %>% dplyr::select(order_title, order_num, num_students, sat_score_min, sat_score_max, psat_score_min, psat_score_max)

4 CA HS SAT EDA

la_zip_codes <- (zip_cbsa_data %>% filter(cbsa_1 == '31080'))$zip_code

# Look at just College Board LA students
lists_df_sat_la <- lists_df_sat %>%
  select(-test_type, -order_num, -order_date) %>% distinct() %>%  # Each student is unique - got rid of duplicates that came from multiple orders
  mutate(
    zip_code = str_pad(str_sub(ZipCode, 1, 5), width = 5, pad = '0', side = 'left'),
    ceeb = str_pad(SchoolCode, width = 6, pad = '0', side = 'left'),
    is_white = as.integer(str_detect(Race, 'White'))
  ) %>% 
  filter(zip_code %in% la_zip_codes)
length(unique(lists_df_sat_la$zip_code))  # 307 zip codes in LA
## [1] 307
length(unique(lists_df_sat_la$ceeb))  # 348 HS in LA
## [1] 348
# Group by HS
lists_df_sat_la_hs <- lists_df_sat_la %>%
  group_by(ceeb) %>% 
  summarise(total_students_purchased = n(), pct_white_purchased = mean(is_white, na.rm = T) * 100) %>% 
  arrange(-total_students_purchased) %>%
  left_join(ceeb_nces, by = 'ceeb') %>% 
  left_join(cds_nces %>% select(ncessch, CDSCode), by = 'ncessch') %>% 
  left_join(hs_data %>% select(ncessch, total_students, pct_white), by = 'ncessch') %>% 
  left_join(df_sat_ca_20, by = c('CDSCode' = 'CDS'))

lists_df_sat_la_hs

Available variables:

  • Student lists
    • total_students_purchased
    • pct_white_purchased
  • NCES HS data
    • total_students
    • pct_white
  • CA DOE data
    • 12th graders: Enroll12 (enrolled), NumTSTTakr12 (# of testtakers), NumERWBenchmark12 (# met english benchmark), NumMathBenchmark12 (# met math benchmark), TotNumBothBenchmark12 (# met both benchmarks)
    • 11th graders: Enroll11, NumTSTTakr11, NumERWBenchmark11, NumMathBenchmark11, TotNumBothBenchmark11
    • 12th + 11th graders: Enroll1112, NumTSTTakr1112, NumERWBenchmark1112, NumMathBenchmark1112, TotNumBothBenchmark1112

Possible analysis:

  • Compare high school’s racial composition & test readiness against actual purchased students
  • Ideally, we’d be able to isolate purchased students by grad year/grade to match with secondary data for the same class of students
    • We weren’t able to do it for Urbana because some orders filtered for multiple graduating classes and there’s no grad_class column in the student-level data - that could be something we can try to request for
    • We’ll also need to update NCES data (e.g., different year’s data, use race variables by grade level) and check crosswalk

Potential questions/concerns:

  • If the characteristics of purchased students (e.g., racial composition, test readiness) do indeed differ from that of the high school for the same cohort/class of students, what are we able to conclude about why that is?
    • We will need to look at the order summary criteria to see what filters were being used
    • In the case of Urbana, the out-of-state orders filtered by geography (state, MSA, segment analysis), SAT/PSAT score ranges, and HS GPA (and sometimes separate orders for gender but both male/female were purchased)
    • In theory, all students who took the SAT who met the test score/GPA criteria and attended a HS that falls within the purchased geography/segment should be on the list
    • If a student from a purchased HS is not on the list, they either did not take the SAT/opt in or did not meet the test/GPA criteria
  • In other words, Urbana could not specifically pick students from the high school (apart from the filter criteria used), but they are specifically picking high schools to purchase through the segment analysis filter
    • It would be ideal to know the characteristics of the EN/HS clusters being purchased as compared to the clusters not purchased
    • We could try to look at purchased student-level data to infer characteristics of purchased EN/HS clusters, but without knowing the nonpurchased students, it would be hard to make comparisons/draw conclusions
    • Ideally we would be able to request the segment analysis info from the University


The table below compares the number of student’s purchased for each high school (according to ncessch ID) and the racial/ethnic identities of those students to the racial/ethnic makeup of the school.

df_la_race <- df_sl_la_race %>%
       group_by(ncessch) %>%
       filter(row_number(ncessch) == 1) %>%
       arrange(-n_stu_nces) %>%
       select(ncessch, n_stu_nces, SName, stu_pct_white, pct_white, stu_pct_black, pct_black ,stu_pct_asian, pct_asian ,stu_pct_latinx, pct_hispanic, stu_pct_natam, pct_amerindian, stu_pct_other, pct_other)

df_la_race

The table below sorts in descending order the number of students purchased by high school (according to ncessch ID) and shows the number of 11th and 12th graders enrolled (2018-2019 academic year), number of 11th & 12th grade test takers, and the number and percentage of test takers that met the benchmakrs for ERW and Math for the SAT.

  • 11th graders in 2018-2019 are graduating class of 2020
  • 12th graders in 2018-2019 are graduating class of 2019
  • SAT benchmark for ERW is score of 480 and benchmark for Math is score of 530.
  • For 11th graders benchmark for ERW is 460 and benchmark for Math is 510.
df_la_county_per <- df_la_county %>%
  group_by(ncessch) %>%
  filter(row_number(ncessch) == 1, !is.na(CDS)) %>%
  select(ncessch, SName, n_stu_nces, Enroll1112, NumTSTTakr1112, NumERWBenchmark1112, PctERWBenchmark1112, NumMathBenchmark1112, PctMathBenchmark1112, TotNumBothBenchmark11, PctBothBenchmark1112) %>% 
  arrange(-n_stu_nces)

df_la_county_per

Comments

  • The CA Department of Education separates SAT scores for ERW and Math, while order summaries combine both to filter for total SAT score.
  • Might be tricky to identify what SAT ranges universities’ filter for since we are comparing at the high school-level and separate orders could filter for different SAT ranges (e.g., students purchased from same high school may come from separate orders.)
    • Could perhaps identify a range for SAT filter a university uses since that usually doesn’t vary much.

5 Zip code level analysis

5.1 By zip code

5.2 By HS aggregated to zip code level

lists_df_all <- lists_df_pivot %>%  # 434120 unique students
  filter(Country == 'United States') %>% 
  select(-test_type, -order_num, -order_date) %>% distinct() %>%  # Each student is unique - got rid of duplicates that came from multiple orders
  mutate(
    ceeb = str_pad(SchoolCode, width = 6, pad = '0', side = 'left')
  )

ceeb_hs <- ceeb_nces %>% inner_join(hs_data, by = 'ncessch')  # get rid of rows w/o NCES data too

lists_df_all_hs <- lists_df_all %>% left_join(ceeb_hs, by = 'ceeb')  # 476 repeated rows (matched to one of the 46 ceeb that had multiple ncessch entries in crosswalk)

# 322211 (80.7%) from public HS, 47544 (11.9%) from private HS, 29525 (7.4%) either no entry in crosswalk or no available NCES data
table(lists_df_all_hs$school_type, useNA = 'always')
## 
## Private  Public    <NA> 
##   47544  322211   29525
lists_df_all_zip <- lists_df_all %>% right_join(ceeb_hs, by = 'ceeb')
# dropping NA students who did not match to available NCES data
# this also dropped HS whose ncessch did not exist in crosswalk (no chance of it merging w/ purchased students if there were any)

lists_df_all_zip %>% count(is.na(Ref))  # 369755 purchased students
length(unique(lists_df_all_zip$ncessch))  # 19986 HS
## [1] 19986
# Group by HS's zip code
lists_df_all_zip_agg <- lists_df_all_zip %>%
  mutate(
    ncessch_purchased = if_else(is.na(Ref), NA_character_, ncessch)
  ) %>% 
  group_by(zip_code, state_code) %>%
  summarise(
    num_hs = n_distinct(ncessch, na.rm = T),
    num_hs_purchased = n_distinct(ncessch_purchased, na.rm = T),
    num_students_purchased = sum(as.numeric(!is.na(Ref)))
  ) %>%
  arrange(desc(num_students_purchased))

lists_df_all_zip_agg
sum(lists_df_all_zip_agg$num_students_purchased)  # 369755 purchased students
## [1] 369755
sum(lists_df_all_zip_agg$num_hs)  # 19986 HS
## [1] 19986
sum(lists_df_all_zip_agg$num_hs_purchased)  # 6299 purchased HS
## [1] 6299
# Filter for just schools (that's in crosswalk file) in LA zip codes (506 HS)
lists_df_all_zip_agg %>%filter(zip_code %in% la_zip_codes)
lists_df_all_la_zip <- lists_df_all_zip %>% filter(zip_code %in% la_zip_codes)

lists_df_all_la_zip %>% count(is.na(Ref))  # 18470 purchased students
length(unique(lists_df_all_la_zip$ncessch))  # 506 HS
## [1] 506
lists_df_all_la_zip_by_purchase <- lists_df_all_la_zip %>%
  group_by(zip_code, state_code, ncessch, total_students, pct_white, pct_poc) %>% 
  summarise(
    num_students_purchased = sum(!is.na(Ref))
  ) %>% 
  mutate(is_hs_purchased = num_students_purchased > 0) %>%
  group_by(zip_code, state_code, is_hs_purchased) %>%
  summarise(
    num_hs = n(),
    num_students_purchased = sum(num_students_purchased),
    avg_hs_size = mean(total_students),
    avg_pct_white = mean(pct_white),
    avg_pct_poc = mean(pct_poc),
    avg_pct_white_weighted = sum(total_students / sum(total_students) * pct_white),
    avg_pct_poc_weighted = sum(total_students / sum(total_students) * pct_poc)
  ) %>% 
  arrange(zip_code, desc(is_hs_purchased))

lists_df_all_la_zip_by_purchase
sum(lists_df_all_la_zip_by_purchase$num_students_purchased)  # 18470 purchased students
## [1] 18470
sum(lists_df_all_la_zip_by_purchase$num_hs)  # 506 HS
## [1] 506
lists_df_all_la_zip_by_purchase %>% group_by(is_hs_purchased) %>% summarise(count = sum(num_hs))  # 230 HS purchased (276 HS not purchased)

Notes:

  • We would ideally know the HS/EN clusters to differentiate between HS that weren’t purchased vs. HS that were purchased but no students met other criteria
  • CEEB/NCES crosswalk